# Setup shared by every simulation section below: attach packages, load the
# project helpers, and build the document-feature matrices.
#
# NOTE: run this script in a fresh R session. It deliberately does not call
# rm(list = ls()), so sourcing it never wipes the caller's workspace.

# library() stops immediately when a package is missing, whereas require()
# only returns FALSE with a warning and lets the script fail later.
library(quanteda)
library(quanteda.textstats)
library(stringi)
library(newsmap)
source("functions.R")
quanteda_options(threads = 8)

# Seed-word dictionary read from YAML.
dict <- dictionary(file = "topic_candidate.yml")

# Stored tokens object, restricted to documents with year >= 1991.
toks <- readRDS("data/data_tokens_sent.RDS") %>%
  tokens_subset(year >= 1991)

# Compound multi-word dictionary entries into single tokens joined by a space.
toks <- tokens_compound(toks, dict, join = FALSE, concatenator = " ")

# Build the dfm, drop padding ("") and English stopwords, then discard
# features occurring fewer than 10 times. The `remove` argument of dfm() is
# deprecated since quanteda 3.0, so dfm_remove() is applied explicitly.
mt <- dfm(toks) %>%
  dfm_remove(pattern = c("", stopwords("en"))) %>%
  dfm_trim(min_termfreq = 10)

# Evaluation subset: non-empty documents that carry a human topic code.
mt_sub <- dfm_subset(mt, !is.na(topic_human) & ntoken(mt) > 0)
dat <- docvars(mt_sub)

# Random draw ============================================
# Random-draw simulation. For each of 50 repetitions, seed lists are drawn
# with sample_list() for n = 1..7, a Newsmap model is fitted on each draw, and
# its predictions on the human-coded subset (mt_sub) are scored against
# dat$topic_human. One data.frame per repetition is collected and the list is
# saved to "data_simulation_random.RDS".
local({
  lis_all <- quanteda:::flatten_dictionary(dict, levels = 2)
  n_rep <- 50
  simu_rand <- vector("list", n_rep)
  for (i in seq_len(n_rep)) {
    cat(i, "\n")
    dat_bs <- data.frame()
    for (n in seq_len(7)) {
      # sample_list() comes from functions.R; presumably n controls the size
      # of the random draw -- confirm there.
      lis_bs <- sample_list(lis_all, n)
      mt_bs <- dfm_lookup(mt, dictionary(lis_bs), capkeys = FALSE, levels = 1)
      map_bs <- textmodel_newsmap(mt, mt_bs)
      summ <- summary(accuracy(predict(map_bs, newdata = mt_sub), dat$topic_human))
      # f1 is the harmonic mean of summ[1] and summ[2]; it is NaN when both
      # are zero.
      dat_temp <- data.frame(n, i,
                             d2 = get_coverage(mt_bs),
                             e2 = get_entropy(mt, mt_bs, smooth = 1),
                             p = summ[1], r = summ[2], P = summ[3], R = summ[4],
                             f1 = 2 * ((summ[1] * summ[2]) / (summ[1] + summ[2])),
                             stringsAsFactors = FALSE)

      dat_bs <- rbind(dat_bs, dat_temp)
      # Progress plot: coverage vs. F1, coloured by n and sized by entropy.
      # The point size previously referenced a non-existent column `e1`,
      # which evaluated to NULL and left every point at the default size.
      plot(dat_bs$d2, dat_bs$f1, col = dat_bs$n, cex = dat_bs$e2)
    }
    simu_rand[[i]] <- dat_bs
  }
  saveRDS(simu_rand, "data_simulation_random.RDS")
})

# Path ============================================

# Path simulation. Each of 100 repetitions starts from the "knowledge" seed
# words and grows the list with grow_list() for up to 31 steps, refitting and
# scoring a Newsmap model at every step. One data.frame per repetition is
# collected and the list is saved to "data_simulation_path.RDS".
local({
  # Loop-invariant inputs are built once instead of on every repetition.
  lis_all <- quanteda:::flatten_dictionary(dict, levels = 2)
  lis_start <- quanteda:::flatten_dictionary(dict["knowledge"], levels = 2)
  # No word has been added yet at step 1, so both attributes start as NA.
  attr(lis_start, "value") <- NA
  attr(lis_start, "key") <- NA

  n_rep <- 100
  simu_path <- vector("list", n_rep)
  for (i in seq_len(n_rep)) {
    cat(i, "\n")
    dat_bs <- data.frame()
    lis_bs <- lis_start
    for (n in seq_len(31)) {
      mt_bs <- dfm_lookup(mt, dictionary(lis_bs), capkeys = FALSE, levels = 1)
      map_bs <- textmodel_newsmap(mt, mt_bs)
      summ <- summary(accuracy(predict(map_bs, newdata = mt_sub), dat$topic_human))
      # The "value"/"key" attributes of lis_bs are recorded as the word and
      # topic added at this step.
      dat_temp <- data.frame(n, i, noise = FALSE,
                             word_added = attr(lis_bs, "value"),
                             topic_added = attr(lis_bs, "key"),
                             d2 = get_coverage(mt_bs),
                             e2 = get_entropy(mt, mt_bs, smooth = 1),
                             p = summ[1], r = summ[2], P = summ[3], R = summ[4],
                             f1 = 2 * ((summ[1] * summ[2]) / (summ[1] + summ[2])),
                             stringsAsFactors = FALSE)
      # Skip the log line for the initial (unmodified) seed list.
      if (nrow(dat_bs) > 0) {
        cat(sprintf("%d Add %s: %s (%.3f)", n - 1,
                    stri_trans_toupper(dat_temp$topic_added), dat_temp$word_added,
                    dat_temp$f1), "\n")
      }
      dat_bs <- rbind(dat_bs, dat_temp)
      plot_path2(dat_bs)
      # grow_list() comes from functions.R; presumably it adds one candidate
      # from lis_all and stores it in the "value"/"key" attributes -- confirm.
      lis_bs <- grow_list(lis_bs, lis_all)
      # Stop early once grow_list() reports nothing was added.
      if (length(attr(lis_bs, "value")) == 0) break
    }
    simu_path[[i]] <- dat_bs
  }
  saveRDS(simu_path, "data_simulation_path.RDS")
})

# Selection ============================================
# Selection simulation. Scores the initial "knowledge" seed list, then scores
# it again with each "frequency" candidate appended in turn. Relative changes
# against the baseline row are added before the table is saved to
# "data_simulation_selection.RDS".
local({

  flatten_key <- function(key) {
    quanteda:::flatten_dictionary(dict[key], levels = 2)
  }
  seeds_base <- get_initial(flatten_key("knowledge"))
  seeds_cand <- flatten_key("frequency")

  # Fit a Newsmap model seeded with `seeds` and return a one-row data.frame
  # holding its coverage, entropy and accuracy scores.
  score_seeds <- function(seeds, i, j, word, topic) {
    mt_seed <- dfm_lookup(mt, dictionary(seeds), capkeys = FALSE, levels = 1)
    map_seed <- textmodel_newsmap(mt, mt_seed)
    acc <- summary(accuracy(predict(map_seed, newdata = mt_sub), dat$topic_human))
    data.frame(i = i, j = j, noise = FALSE,
               word_added = word,
               topic_added = topic,
               d2 = get_coverage(mt_seed),
               e2 = get_entropy(mt, mt_seed, smooth = 1),
               p = acc[1], r = acc[2], P = acc[3], R = acc[4],
               f1 = 2 * ((acc[1] * acc[2]) / (acc[1] + acc[2])),
               stringsAsFactors = FALSE)
  }

  # Row 1 is the baseline: no word added.
  dat_bs <- score_seeds(seeds_base, 0, 0, NA, NA)

  for (i in seq_along(seeds_cand)) {
    for (j in seq_along(seeds_cand[[i]])) {
      seeds_try <- append_list(seeds_base, seeds_cand, i, j)
      dat_temp <- score_seeds(seeds_try, i, j,
                              attr(seeds_try, "value"),
                              attr(seeds_try, "key"))
      cat(sprintf("%d %d Test %s: %s (%.3f, %.3f)", i, j,
                  stri_trans_toupper(dat_temp$topic_added), dat_temp$word_added,
                  dat_temp$e2, dat_temp$f1), "\n")
      dat_bs <- rbind(dat_bs, dat_temp)
      # Progress plot: entropy shift from the baseline vs. F1, labelled by word.
      plot(dat_bs$e2 - dat_bs$e2[1], dat_bs$f1, type = "n")
      abline(v = 0, lty = 3)
      text(dat_bs$e2 - dat_bs$e2[1], dat_bs$f1, dat_bs$word_added)
      grid()
    }
  }

  # Relative change of each metric against the baseline (first) row.
  rel_change <- function(x) (x - x[1]) / x[1]
  for (metric in c("d2", "e2", "f1")) {
    dat_bs[[paste0(metric, "_diff")]] <- rel_change(dat_bs[[metric]])
  }
  saveRDS(dat_bs, "data_simulation_selection.RDS")
})

# Summary ============================================

# Summary. Builds three dictionaries from the selection results: one cleaned
# of words with e2_diff > 0, one cleaned of words with e2_diff < 0, and the
# full candidate list. Each is scored with a Newsmap model. The two cleaned
# dictionaries are written to YAML and the score table is saved to
# "data_simulation_summary.RDS".
local({

  dict <- dictionary(file = "topic_candidate.yml")
  lis_all <- quanteda:::flatten_dictionary(dict, levels = 2)
  dat_cand <- readRDS("data_simulation_selection.RDS")

  # Fit a Newsmap model seeded with `dict_seed` and return its accuracy
  # summary plus the supplied word count and the seed coverage, as a list.
  score_dict <- function(dict_seed, n_words) {
    mt_seed <- dfm_lookup(mt, dict_seed, capkeys = FALSE, levels = 1)
    map_seed <- textmodel_newsmap(mt, mt_seed)
    pred <- predict(map_seed, newdata = mt_sub)
    as.list(c(summary(accuracy(pred, dat$topic_human)),
              n = n_words, d2 = get_coverage(mt_seed)))
  }

  dat_summ <- data.frame()

  words_up <- dat_cand$word_added[dat_cand$e2_diff > 0]
  dict_low <- dictionary(clean_list(lis_all, words_up))
  dat_summ <- rbind(dat_summ,
                    "low e2" = score_dict(dict_low, length(unlist(dict_low))))

  words_down <- dat_cand$word_added[dat_cand$e2_diff < 0]
  dict_high <- dictionary(clean_list(lis_all, words_down))
  dat_summ <- rbind(dat_summ,
                    "high e2" = score_dict(dict_high, length(unlist(dict_high))))

  dict_all <- dictionary(lis_all)
  dat_summ <- rbind(dat_summ,
                    all = score_dict(dict_all, length(unlist(lis_all))))

  cat(as.yaml(dict_low), file = "topic_low-entropy.yml")
  cat(as.yaml(dict_high), file = "topic_high-entropy.yml")
  saveRDS(dat_summ, "data_simulation_summary.RDS")
})
